/*
 *   This program is free software: you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, either version 3 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU General Public License for more details.
 *
 *   You should have received a copy of the GNU General Public License
 *   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

/*
 *    C45Loader.java
 *    Copyright (C) 2000-2012 University of Waikato, Hamilton, New Zealand
 *
 */

package weka.core.converters;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Utils;

/**
 * <!-- globalinfo-start --> Reads a file that is C45 format. Can take a
 * filestem or filestem with .names or .data appended. Assumes that
 * path/&lt;filestem&gt;.names and path/&lt;filestem&gt;.data exist and contain
 * the names and data respectively.
 * <p/>
 * <!-- globalinfo-end -->
 * 
 * @author Mark Hall (mhall@cs.waikato.ac.nz)
 * @version $Revision$
 * @see Loader
 */
public class C45Loader extends AbstractFileLoader implements BatchConverter, IncrementalConverter {

    /** for serialization */
    static final long serialVersionUID = 5454329403218219L;

    /** the default file extension (the extension of the names file) */
    public static String FILE_EXTENSION = ".names";

    /**
     * The data file (&lt;filestem&gt;.data) that belongs to the current names
     * file. Set by {@link #setSource(File)}.
     */
    private File m_sourceFileData = null;

    /**
     * Reader for names file
     */
    private transient Reader m_namesReader = null;

    /**
     * Reader for data file
     */
    private transient Reader m_dataReader = null;

    /**
     * Tokenizer over {@link #m_dataReader} that is shared by successive calls to
     * {@link #getNextInstance(Instances)}. A StreamTokenizer can hold one
     * character of look-ahead internally, so creating a fresh tokenizer for every
     * instance would silently drop that character. Reset to null whenever a new
     * source is set.
     */
    private transient StreamTokenizer m_dataTokenizer = null;

    /**
     * Holds the filestem.
     */
    private String m_fileStem;

    /**
     * Number of attributes in the data (including ignore and label attributes).
     */
    private int m_numAttribs;

    /**
     * Which attributes are ignore or label. These are *not* included in the arff
     * representation.
     */
    private boolean[] m_ignore;

    /**
     * Returns a string describing this loader
     * 
     * @return a description of the loader suitable for displaying in the
     *         explorer/experimenter gui
     */
    public String globalInfo() {
        return "Reads a file that is C45 format. Can take a filestem or filestem " + "with .names or .data appended. Assumes that path/<filestem>.names and " + "path/<filestem>.data exist and contain the names and data " + "respectively.";
    }

    /**
     * Resets the Loader ready to read a new data set or the same data set again.
     * 
     * @throws IOException if something goes wrong
     */
    @Override
    public void reset() throws IOException {
        m_structure = null;
        setRetrieval(NONE);

        if (m_File != null) {
            setFile(new File(m_File));
        }
    }

    /**
     * Get the default file extension used for C4.5 files (the names file).
     * 
     * @return the file extension
     */
    @Override
    public String getFileExtension() {
        return FILE_EXTENSION;
    }

    /**
     * Gets all the file extensions used for this type of file
     * 
     * @return the file extensions
     */
    @Override
    public String[] getFileExtensions() {
        return new String[] { ".names", ".data" };
    }

    /**
     * Returns a description of the file type.
     * 
     * @return a short file description
     */
    @Override
    public String getFileDescription() {
        return "C4.5 data files";
    }

    /**
     * Resets the Loader object and sets the source of the data set to be the
     * supplied File object. The file may be a bare filestem or a filestem with an
     * extension; in both cases &lt;filestem&gt;.names and &lt;filestem&gt;.data in
     * the same directory are opened. Readers left over from a previous source are
     * closed first.
     * 
     * @param file the source file.
     * @exception IOException if the file is null or the names or data file cannot
     *                        be found
     */
    @Override
    public void setSource(File file) throws IOException {
        m_structure = null;
        setRetrieval(NONE);

        if (file == null) {
            throw new IOException("Source file object is null!");
        }

        String parent = file.getParent();
        String path = (parent != null) ? parent + File.separator : "";

        // everything before the last '.' is the stem; no '.' means the name is the stem
        String fname = file.getName();
        int lastDot = fname.lastIndexOf('.');
        String fileStem = (lastDot < 0) ? fname : fname.substring(0, lastDot);
        m_fileStem = fileStem;

        String namesPath = path + fileStem + ".names";
        String dataPath = path + fileStem + ".data";

        // release whatever an earlier call opened, otherwise those handles leak
        closeReader(m_namesReader);
        closeReader(m_dataReader);
        m_namesReader = null;
        m_dataReader = null;
        m_dataTokenizer = null;

        File namesFile = new File(namesPath);
        m_sourceFile = namesFile;
        try {
            m_namesReader = new BufferedReader(new FileReader(namesFile));
        } catch (FileNotFoundException ex) {
            throw new IOException("File not found : " + namesPath, ex);
        }

        m_sourceFileData = new File(dataPath);
        try {
            m_dataReader = new BufferedReader(new FileReader(m_sourceFileData));
        } catch (FileNotFoundException ex) {
            // do not keep the names file open when the pair is incomplete
            closeReader(m_namesReader);
            m_namesReader = null;
            throw new IOException("File not found : " + dataPath, ex);
        }
        m_File = namesFile.getAbsolutePath();
    }

    /**
     * Determines and returns (if possible) the structure (internally the header) of
     * the data set as an empty set of instances. If the structure is not yet
     * known, the source is (re)opened and the names file is parsed.
     * 
     * @return the structure of the data set as an empty set of Instances
     * @exception IOException if no source has been set or parsing fails
     */
    @Override
    public Instances getStructure() throws IOException {
        if (m_sourceFile == null) {
            throw new IOException("No source has been specified");
        }

        if (m_structure == null) {
            setSource(m_sourceFile);
            try {
                StreamTokenizer st = new StreamTokenizer(m_namesReader);
                initTokenizer(st);
                readHeader(st);
            } finally {
                // the names file is only needed for the header
                closeReader(m_namesReader);
                m_namesReader = null;
            }
        }

        return m_structure;
    }

    /**
     * Return the full data set. If the structure hasn't yet been determined by a
     * call to getStructure then method should do so before processing the rest of
     * the data set. The data reader is closed afterwards, also when parsing fails.
     * 
     * @return all instances of the data file
     * @exception IOException if there is no source or parsing fails
     */
    @Override
    public Instances getDataSet() throws IOException {
        if (m_sourceFile == null) {
            throw new IOException("No source has been specified");
        }
        if (getRetrieval() == INCREMENTAL) {
            throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
        }
        if (m_structure == null) {
            getStructure();
        }
        // set the mode only after getStructure(): it goes through setSource(),
        // which resets the retrieval mode to NONE
        setRetrieval(BATCH);

        Instances result = new Instances(m_structure);
        try {
            StreamTokenizer st = new StreamTokenizer(m_dataReader);
            initTokenizer(st);
            for (Instance current = getInstance(st); current != null; current = getInstance(st)) {
                result.add(current);
            }
        } finally {
            closeReader(m_dataReader);
        }
        return result;
    }

    /**
     * Read the data set incrementally---get the next instance in the data set or
     * returns null if there are no more instances to get. If the structure hasn't
     * yet been determined by a call to getStructure then method should do so before
     * returning the next instance in the data set.
     * 
     * The data reader is closed once the end of the data file has been reached.
     * 
     * @param structure the dataset header information (not used by this loader;
     *                  the internally stored structure is attached to the
     *                  returned instance)
     * @return the next instance in the data set as an Instance object or null if
     *         there are no more instances to be read
     * @exception IOException if there is an error during parsing
     */
    @Override
    public Instance getNextInstance(Instances structure) throws IOException {
        if (m_sourceFile == null) {
            throw new IOException("No source has been specified");
        }

        if (getRetrieval() == BATCH) {
            throw new IOException("Cannot mix getting Instances in both incremental and batch modes");
        }

        if (m_structure == null) {
            getStructure();
        }
        // set the mode only after getStructure(): it goes through setSource(),
        // which resets the retrieval mode to NONE
        setRetrieval(INCREMENTAL);

        // keep one tokenizer for the whole file so its look-ahead character
        // survives between calls
        if (m_dataTokenizer == null) {
            m_dataTokenizer = new StreamTokenizer(m_dataReader);
            initTokenizer(m_dataTokenizer);
        }

        Instance nextI = getInstance(m_dataTokenizer);
        if (nextI != null) {
            nextI.setDataset(m_structure);
        } else {
            closeReader(m_dataReader);
        }
        return nextI;
    }

    /**
     * Closes a reader on a best-effort basis. A failure to close is printed but
     * not propagated.
     * 
     * @param reader the reader to close, may be null
     */
    private static void closeReader(Reader reader) {
        if (reader == null) {
            return;
        }
        try {
            reader.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Reads an instance using the supplied tokenizer. One token is consumed per
     * column of the data file; columns flagged in m_ignore are read and discarded.
     * 
     * @param tokenizer the tokenizer to use
     * @return an Instance or null if there are no more instances to read
     * @exception IOException if a value is missing from the line or cannot be
     *                        parsed
     */
    private Instance getInstance(StreamTokenizer tokenizer) throws IOException {
        double[] instance = new double[m_structure.numAttributes()];

        StreamTokenizerUtils.getFirstToken(tokenizer);
        if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
            return null;
        }

        int counter = 0; // index into the (non-ignored) attributes of m_structure
        for (int i = 0; i < m_numAttribs; i++) {
            if (i > 0) {
                StreamTokenizerUtils.getToken(tokenizer);
            }
            if (m_ignore[i]) {
                continue;
            }

            // Check if value is missing.
            if (tokenizer.ttype == '?') {
                instance[counter++] = Utils.missingValue();
                continue;
            }

            String val = tokenizer.sval;
            if (val == null) {
                // end of line/file reached before all columns were read
                throw new IOException("premature end of line, line " + tokenizer.lineno() + ": expected " + m_numAttribs + " values");
            }
            // header values are trimmed, so data values must be trimmed to match
            val = val.trim();
            if (i == m_numAttribs - 1) {
                val = removeTrailingPeriod(val);
            }

            Attribute att = m_structure.attribute(counter);
            if (att.isNominal()) {
                int index = att.indexOfValue(val);
                if (index == -1) {
                    StreamTokenizerUtils.errms(tokenizer, "nominal value not declared in " + "header :" + val + " column " + i);
                }
                instance[counter] = index;
            } else if (att.isNumeric()) {
                try {
                    instance[counter] = Double.parseDouble(val);
                } catch (NumberFormatException e) {
                    StreamTokenizerUtils.errms(tokenizer, "number expected");
                }
            } else {
                // readHeader only creates nominal and numeric attributes
                throw new IllegalStateException("Unexpected attribute type for attribute " + att.name());
            }
            counter++;
        }

        return new DenseInstance(1.0, instance);
    }

    /**
     * removes the trailing period, if there is one
     * 
     * @param val the string to work on (may be empty)
     * @return the processed string
     */
    private String removeTrailingPeriod(String val) {
        return val.endsWith(".") ? val.substring(0, val.length() - 1) : val;
    }

    /**
     * Collects the values on the current line, starting with the tokenizer's
     * current token and stopping at end of line or end of file. Each value is
     * trimmed and stripped of a trailing period; empty values are skipped.
     * 
     * @param tokenizer the tokenizer to read from
     * @return the values in the order they were read
     * @exception IOException if an error occurs
     */
    private ArrayList<String> readValueList(StreamTokenizer tokenizer) throws IOException {
        ArrayList<String> values = new ArrayList<String>();
        while (tokenizer.ttype != StreamTokenizer.TT_EOL && tokenizer.ttype != StreamTokenizer.TT_EOF) {
            if (tokenizer.sval != null) {
                String val = tokenizer.sval.trim();
                if (val.length() > 0) {
                    values.add(removeTrailingPeriod(val));
                }
            }
            StreamTokenizerUtils.getToken(tokenizer);
        }
        return values;
    }

    /**
     * Reads header (from the names file) using the supplied tokenizer. The first
     * line holds the class values (or, c5 style, the name of the class attribute);
     * every following line holds an attribute name and its type or values. Sets
     * m_structure, m_numAttribs and m_ignore.
     * 
     * @param tokenizer the tokenizer to use
     * @exception IOException if an error occurs
     */
    private void readHeader(StreamTokenizer tokenizer) throws IOException {

        ArrayList<Attribute> attribDefs = new ArrayList<Attribute>();
        ArrayList<Integer> ignores = new ArrayList<Integer>();
        StreamTokenizerUtils.getFirstToken(tokenizer);
        if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
            StreamTokenizerUtils.errms(tokenizer, "premature end of file");
        }

        // Read the class values
        ArrayList<String> classVals = readValueList(tokenizer);

        // read the attribute names and types; column counts every attribute line,
        // including ignored ones, so it matches the column position in the data file
        int column = 0;
        while (tokenizer.ttype != StreamTokenizer.TT_EOF) {
            StreamTokenizerUtils.getFirstToken(tokenizer);
            if (tokenizer.ttype == StreamTokenizer.TT_EOF) {
                break;
            }

            String attribName = tokenizer.sval;

            StreamTokenizerUtils.getToken(tokenizer);
            if (tokenizer.ttype == StreamTokenizer.TT_EOL || tokenizer.ttype == StreamTokenizer.TT_EOF) {
                StreamTokenizerUtils.errms(tokenizer, "premature end of line. Expected " + "attribute type.");
            }
            String type = tokenizer.sval.toLowerCase().trim();
            if (type.startsWith("ignore") || type.startsWith("label")) {
                ignores.add(Integer.valueOf(column));
            } else if (type.startsWith("continuous")) {
                attribDefs.add(new Attribute(attribName));
            } else {
                // anything else is the first of the attribute's nominal values
                attribDefs.add(new Attribute(attribName, readValueList(tokenizer)));
            }
            column++;
        }

        // a single "class value" may actually name one of the attributes
        // (ala c5 names file style); in that case that attribute is the class
        int classIndex = -1;
        if (classVals.size() == 1) {
            String candidate = classVals.get(0);
            for (int i = 0; i < attribDefs.size(); i++) {
                if (attribDefs.get(i).name().equals(candidate)) {
                    classIndex = i;
                    break;
                }
            }
        }

        if (classIndex < 0) {
            attribDefs.add(new Attribute("Class", classVals));
            classIndex = attribDefs.size() - 1;
        }

        m_structure = new Instances(m_fileStem, attribDefs, 0);

        try {
            m_structure.setClassIndex(classIndex);
        } catch (Exception ex) {
            ex.printStackTrace();
        }

        m_numAttribs = m_structure.numAttributes() + ignores.size();
        m_ignore = new boolean[m_numAttribs];
        for (Integer ignored : ignores) {
            m_ignore[ignored.intValue()] = true;
        }
    }

    /**
     * Initializes the stream tokenizer: control characters, ',' and ':' separate
     * tokens, everything from ' ' to 0xFF is part of a word, '|' starts a
     * comment, single and double quotes delimit quoted strings, and line ends are
     * reported as tokens.
     * 
     * @param tokenizer the tokenizer to initialize
     */
    private void initTokenizer(StreamTokenizer tokenizer) {
        tokenizer.resetSyntax();
        tokenizer.whitespaceChars(0, (' ' - 1));
        // spaces are word characters, so values may contain embedded blanks
        tokenizer.wordChars(' ', '\u00FF');
        tokenizer.whitespaceChars(',', ',');
        tokenizer.whitespaceChars(':', ':');
        // '.' stays a word character; trailing periods are stripped by the parser
        tokenizer.commentChar('|');
        tokenizer.whitespaceChars('\t', '\t');
        tokenizer.quoteChar('"');
        tokenizer.quoteChar('\'');
        tokenizer.eolIsSignificant(true);
    }

    /**
     * Main method for testing this class.
     * 
     * @param args should contain &lt;filestem&gt;[.names | data]
     */
    public static void main(String[] args) {
        runFileLoader(new C45Loader(), args);
    }
}
